{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from pathlib import Path\n",
    "import pandas as pd\n",
    "import requests\n",
    "import numpy as np\n",
    "import random\n",
    "\n",
    "data_dir = Path(\"data\")\n",
    "data_dir.mkdir(exist_ok=True)\n",
    "\n",
    "templates = [\n",
    "    \"{problem}\",\n",
    "    \"{problem}\",\n",
    "    \"{problem}\",\n",
    "    \"{problem}\",\n",
    "    \"{problem}\",\n",
    "    \"\"\"Solve the following math problem: {problem}\"\"\",\n",
    "    \"\"\"Provide a step by step solution for the following math problem: {problem}\"\"\",\n",
    "    \"\"\"{problem}\n",
    "How to solve this?\"\"\",\n",
    "    \"\"\"{problem}\n",
    "Can you solve this problem?\"\"\",\n",
    "    \"\"\"I need help with this problem:\n",
    "{problem}\"\"\",\n",
    "    \"\"\"{problem}\n",
    "What is the solution?\"\"\",\n",
    "    \"\"\"{problem}\n",
    "Give me a solution to this problem\"\"\",\n",
    "    \"\"\"{problem}\n",
    "Solve it. \"\"\",\n",
    "    \"\"\"{problem}\n",
    "Solve this problem. \"\"\",\n",
    "    \"\"\"{problem}\n",
    "Find the solution. \"\"\",\n",
    "    \"\"\"{problem}\n",
    "Give me a solution to this problem\"\"\",\n",
    "    \"\"\"Solve the math problem: {problem}\"\"\",\n",
    "    \"\"\"Find the answer to this math problem: {problem}\"\"\",\n",
    "    \"\"\"Explain how to solve this math problem: {problem}\"\"\",\n",
    "    \"\"\"{problem}\n",
    "Work out the solution step by step. \"\"\",\n",
    "    \"\"\"{problem}\n",
    "Give me a detailed solution. \"\"\",\n",
    "    \"\"\"Find a solution for this math problem: {problem}\"\"\",\n",
    "    \"\"\"Break down this math problem: {problem}\"\"\",\n",
    "    \"\"\"{problem}\n",
    "Give me a clear explanation. \"\"\",\n",
    "    \"Find the answer to the math problem: {problem}\",\n",
    "    \"Can you explain how to solve this math problem: {problem}\",\n",
    "    \"Please show me the solution for: {problem}\",\n",
    "    \"\"\"I'm stuck on this math problem: {problem}\n",
    "Can you help?\"\"\",\n",
    "    \"Can you guide me through solving this problem: {problem}\",\n",
    "    \"I need a clearer understanding of how to solve: {problem}\",\n",
    "    \"Can you walk me through the solution of: {problem}\",\n",
    "    \"Can you provide an in-depth solution for: {problem}\"\n",
    "    \"Hey there, could you help me solve this math problem: {problem}\",\n",
    "    \"Can you give me some step-by-step instructions for this math problem: {problem}\",\n",
    "    \"\"\"I'm completely lost with this math problem: {problem}\n",
    "Can you give me a hand?\"\"\",\n",
    "    \"\"\"This math problem has got me stumped: {problem}\n",
    "Can you show me the way?\"\"\",\n",
    "    \"\"\"I would love to understand how to solve this problem: {problem}\n",
    "Can you explain?\"\"\",\n",
    "    \"Can you break down the solution for me for this math problem: {problem}\",\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [],
   "source": [
    "def download_original(name):\n",
    "    with requests.get(\n",
    "        f\"https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/{name}\"\n",
    "    ) as response:\n",
    "        with open(data_dir / name, \"w\") as f:\n",
    "            f.write(response.text)\n",
    "\n",
    "\n",
    "def load_df(name):\n",
    "    with open(data_dir / name) as f:\n",
    "        df = pd.read_json(f, lines=True)\n",
    "\n",
    "    return pd.DataFrame(\n",
    "        {\n",
    "            \"INSTRUCTION\": df.apply(lambda x: np.random.choice(templates).format(problem=x[\"question\"]), axis=1),\n",
    "            \"RESPONSE\": df[\"answer\"].str.replace(r\"<<.*>>|\\n####.*\", \"\"),\n",
    "            \"SOURCE\": \"grade-school-math\",\n",
    "        }\n",
    "    )\n",
    "\n",
    "\n",
    "def save_result(df, name):\n",
    "    df.to_parquet(data_dir / f\"{name.split('.')[0]}.parquet\", row_group_size=100, engine=\"pyarrow\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [],
   "source": [
    "# dataset_names = [\"train.jsonl\", \"test.jsonl\", \"train_socratic.jsonl\", \"test_socratic.jsonl\"]\n",
    "dataset_names = [\"train.jsonl\", \"test.jsonl\"]\n",
    "for name in dataset_names:\n",
    "    download_original(name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.concat([load_df(name) for name in dataset_names], ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>INSTRUCTION</th>\n",
       "      <th>RESPONSE</th>\n",
       "      <th>SOURCE</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>This math problem has got me stumped: Natalia ...</td>\n",
       "      <td>Natalia sold 48/2 = 24 clips in May.\\nNatalia ...</td>\n",
       "      <td>grade-school-math</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Weng earns $12 an hour for babysitting. Yester...</td>\n",
       "      <td>Weng earns 12/60 = $0.2 per minute.\\nWorking 5...</td>\n",
       "      <td>grade-school-math</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>I'm completely lost with this math problem: Be...</td>\n",
       "      <td>In the beginning, Betty has only 100 / 2 = $50...</td>\n",
       "      <td>grade-school-math</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Explain how to solve this math problem: Julie ...</td>\n",
       "      <td>Maila read 12 x 2 = 24 pages today.\\nSo she wa...</td>\n",
       "      <td>grade-school-math</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>I need a clearer understanding of how to solve...</td>\n",
       "      <td>He writes each friend 3*2=6 pages a week\\nSo h...</td>\n",
       "      <td>grade-school-math</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8787</th>\n",
       "      <td>John had a son James when he was 19.  James is...</td>\n",
       "      <td>Dora is 12-3=9\\nSo James is 9*2=18 years old\\n...</td>\n",
       "      <td>grade-school-math</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8788</th>\n",
       "      <td>Solve the following math problem: There are so...</td>\n",
       "      <td>There are 60 minutes in an hour. Ana peels an ...</td>\n",
       "      <td>grade-school-math</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8789</th>\n",
       "      <td>Can you provide an in-depth solution for: Mark...</td>\n",
       "      <td>The discount on the radiator was 400*.8=$320\\n...</td>\n",
       "      <td>grade-school-math</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8790</th>\n",
       "      <td>Farmer Brown has 20 animals on his farm, all e...</td>\n",
       "      <td>Let C be the number of chickens.\\nThere are 20...</td>\n",
       "      <td>grade-school-math</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8791</th>\n",
       "      <td>Please show me the solution for: Henry and 3 o...</td>\n",
       "      <td>There are 7*8=56 slices in total.\\nThere are 1...</td>\n",
       "      <td>grade-school-math</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>8792 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                            INSTRUCTION  \\\n",
       "0     This math problem has got me stumped: Natalia ...   \n",
       "1     Weng earns $12 an hour for babysitting. Yester...   \n",
       "2     I'm completely lost with this math problem: Be...   \n",
       "3     Explain how to solve this math problem: Julie ...   \n",
       "4     I need a clearer understanding of how to solve...   \n",
       "...                                                 ...   \n",
       "8787  John had a son James when he was 19.  James is...   \n",
       "8788  Solve the following math problem: There are so...   \n",
       "8789  Can you provide an in-depth solution for: Mark...   \n",
       "8790  Farmer Brown has 20 animals on his farm, all e...   \n",
       "8791  Please show me the solution for: Henry and 3 o...   \n",
       "\n",
       "                                               RESPONSE             SOURCE  \n",
       "0     Natalia sold 48/2 = 24 clips in May.\\nNatalia ...  grade-school-math  \n",
       "1     Weng earns 12/60 = $0.2 per minute.\\nWorking 5...  grade-school-math  \n",
       "2     In the beginning, Betty has only 100 / 2 = $50...  grade-school-math  \n",
       "3     Maila read 12 x 2 = 24 pages today.\\nSo she wa...  grade-school-math  \n",
       "4     He writes each friend 3*2=6 pages a week\\nSo h...  grade-school-math  \n",
       "...                                                 ...                ...  \n",
       "8787  Dora is 12-3=9\\nSo James is 9*2=18 years old\\n...  grade-school-math  \n",
       "8788  There are 60 minutes in an hour. Ana peels an ...  grade-school-math  \n",
       "8789  The discount on the radiator was 400*.8=$320\\n...  grade-school-math  \n",
       "8790  Let C be the number of chickens.\\nThere are 20...  grade-school-math  \n",
       "8791  There are 7*8=56 slices in total.\\nThere are 1...  grade-school-math  \n",
       "\n",
       "[8792 rows x 3 columns]"
      ]
     },
     "execution_count": 102,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Can you provide an in-depth solution for: Ivan had $10 and spent 1/5 of it on cupcakes. He then spent some money on a milkshake and had only $3 left. How much is the milkshake?Hey there, could you help me solve this math problem: Ivan had $10 and spent 1/5 of it on cupcakes. He then spent some money on a milkshake and had only $3 left. How much is the milkshake?\n",
      "\n",
      "Ivan spent a total of $10 - $3 = $7 on cupcakes and a milkshake.\n",
      "The cost of the cupcake is $10 x 1/5 = $2.\n",
      "So, $7 - $2 = $5 was spent on the milkshake.\n"
     ]
    }
   ],
   "source": [
    "ind = random.randint(0, len(df))\n",
    "print(df.iloc[ind][\"INSTRUCTION\"])\n",
    "print()\n",
    "print(df.iloc[ind][\"RESPONSE\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_parquet(str(data_dir / \"output.parquet\"), row_group_size=100, engine=\"pyarrow\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import Dataset\n",
    "\n",
    "ds = Dataset.from_parquet(str((data_dir / \"output.parquet\").absolute()))\n",
    "ds.push_to_hub(\"qwedsacf/grade-school-math-instructions\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "721c9609fa002bad4d3b9b67e869ef29c074aa6b5eebcc2ec10b0e8711444481"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
